#script to match GHGRP refineries facilities with NEI

#GHGRP and NEI can be matched by these things: facility name and distance (LAT, LON)
#I rank matching perfection by match_qual_flag
#match_qual_flag = 0: matched by  facility name and distance sanity check
#match_qual_flag = 1: mutually closest and distance sanity check          
#match_qual_flag = 2: GHGRP's closest NEI and distance sanity check

#############################################################################################################################
#install libraries
#install.packages("plyr")
library(plyr)
#install.packages("dplyr")
library(dplyr)
#install.packages("readr")
library(readr)
#install.packages("psych")
library(psych)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("ggpubr")
library(ggpubr)
#install.packages("geosphere")
library(geosphere)

#############################################################################################################################
#set working directory
setwd ("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_GHGRP_NEI_IND/spreadsheets/output")

#############################################################################################################################
#read prepared .csv files
GHGRP <- read.csv("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_GHGRP_NEI_IND/spreadsheets/input/GHGRP_refineries_2017_CO2_FC.csv")
NEI <- read.csv("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_GHGRP_NEI_IND/spreadsheets/input/NEI_refineries_facilities.csv")

#############################################################################################################################
#rename columns to make it less confusing for later coding
colnames(GHGRP)[colnames(GHGRP) == "FACILITY.NAME"] ="GHGRP_FACILITY_NAME"
colnames(GHGRP)[colnames(GHGRP) == "LONGITUDE"] ="LON_GHGRP"
colnames(GHGRP)[colnames(GHGRP) == "LATITUDE"] ="LAT_GHGRP"
colnames(GHGRP)[colnames(GHGRP) == "GHGRP.ID"] ="GHGRP_ID"

colnames(NEI)[colnames(NEI) == "Average.of.LONGITUDE"] ="LON_NEI"
colnames(NEI)[colnames(NEI) == "Average.of.LATITUDE"] ="LAT_NEI"
colnames(NEI)[colnames(NEI) == "FACILITY_ID"] ="NEI_FACILITY_ID"
colnames(NEI)[colnames(NEI) == "FACILITY_NAME"] ="NEI_FACILITY_NAME"

#############################################################################################################################
#remove AK and HI GHGRP points
GHGRP = GHGRP %>% filter(STATE != 'AK' & STATE != 'HI' & STATE != 'PR')
GHGRP = subset(GHGRP, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP"))

#############################################################################################################################
#remove NEI points that have no lat lon
NEI = NEI %>% filter(!is.na(LON_NEI) & !is.na(LAT_NEI))

#############################################################################################################################
#study GHGRP data
GHGRP_ID_uniq = GHGRP %>% distinct(GHGRP_ID,.keep_all=TRUE)
GHGRP_FACILITY_NAME_uniq = GHGRP %>% distinct(GHGRP_FACILITY_NAME,.keep_all=TRUE)

#############################################################################################################################
#study NEI data
NEI_FACILITY_ID_uniq = NEI %>% distinct(NEI_FACILITY_ID, .keep_all=TRUE)
NEI_FACILITY_NAME_uniq = NEI %>% distinct(NEI_FACILITY_NAME, .keep_all=TRUE)

#some different facility IDs share the same name:
#it is possible that NEI_FACILITY_IDs share the same FACILITY_NAME locate at different locations that cannot be treated as the same location, distance > distance_buffer
#it is also possible that NEI_FACILITY_IDs share the same FACILITY_NAME locate at different locations that can be treated as the same location, distance <= distance_buffer
#in this case, GHGRP emissions should be distributed to all these NEI_FACILITY_IDs

#############################################################################################################################
#match_qual_flag = 0: matched by  facility name and distance sanity check

##########################
#prepare NEI_match0_pool array
NEI_match0_pool = NEI

#matching each row of GHGRP with NEI_match0_pool by FACILITY_NAME
max_mflg0_NEI_FACILITY_ID = 0
for (rr_ghgrp in 1:nrow(GHGRP)) {
  GHGRP_FACILITY_NAME <- tolower(GHGRP[rr_ghgrp, "GHGRP_FACILITY_NAME"])
  match_NEI_FACILITY_ID = 0
  for (rr_nei in 1:nrow(NEI_match0_pool)) {
    NEI_FACILITY_NAME <- tolower(NEI_match0_pool[rr_nei, "NEI_FACILITY_NAME"])
    if (GHGRP_FACILITY_NAME == NEI_FACILITY_NAME){
      match_NEI_FACILITY_ID = match_NEI_FACILITY_ID + 1
      #write down all the NEI NEI_FACILITY_ID that meet the matching criteria
      GHGRP[rr_ghgrp, sprintf("NEI_FACILITY_ID_%d", match_NEI_FACILITY_ID)] = NEI_match0_pool[rr_nei, "NEI_FACILITY_ID"]
    }
  }
  if (match_NEI_FACILITY_ID > max_mflg0_NEI_FACILITY_ID){
    max_mflg0_NEI_FACILITY_ID = match_NEI_FACILITY_ID
  }
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of GHGRP that are match flag 0 pre-DSC
GHGRP_mflg0_preDSC = GHGRP %>% filter(NEI_FACILITY_ID_1 != 'NA')

#reorgainize the table to have each NEI_FACILITY_ID in a private row
#Isolating (max_mflg0_NEI_FACILITY_ID) subsets of GHGRP_mflg0_preDSC
#each subset only have one NEI_FACILITY_ID_* and rename it to NEI_FACILITY_ID
for(i in 1:max_mflg0_NEI_FACILITY_ID) { 
  NEI_FACILITY_ID_i <- paste("NEI_FACILITY_ID_", i, sep = "")
  GHGRP_mflg0_preDSC_NEI_FACILITY_IDi = subset(GHGRP_mflg0_preDSC, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP",NEI_FACILITY_ID_i))
  colnames(GHGRP_mflg0_preDSC_NEI_FACILITY_IDi)[colnames(GHGRP_mflg0_preDSC_NEI_FACILITY_IDi) == NEI_FACILITY_ID_i] ="NEI_FACILITY_ID"
  #stack the subset copies
  if (i==1){
    GHGRP_mflg0_preDSC_flat_scratch = GHGRP_mflg0_preDSC_NEI_FACILITY_IDi
  } else{
    GHGRP_mflg0_preDSC_flat_scratch = rbind(GHGRP_mflg0_preDSC_flat_scratch, GHGRP_mflg0_preDSC_NEI_FACILITY_IDi)
  }
}

#remove rows with NEI_FACILITY_ID == NA #note:filter(!is.na(NEI_FACILITY_ID)) is the same as filter(NEI_FACILITY_ID != 'NA')
GHGRP_mflg0_preDSC_flat = GHGRP_mflg0_preDSC_flat_scratch %>% filter(!is.na(NEI_FACILITY_ID))
GHGRP_mflg0_DSC_flat = join(GHGRP_mflg0_preDSC_flat,NEI_match0_pool, by="NEI_FACILITY_ID", type="left")
for (rr_ghgrp in 1:nrow(GHGRP_mflg0_DSC_flat)) {
  GHGRP_mflg0_DSC_flat[rr_ghgrp, "distance"] = distm(c(GHGRP_mflg0_DSC_flat[rr_ghgrp, "LON_NEI"], GHGRP_mflg0_DSC_flat[rr_ghgrp, "LAT_NEI"]), c(GHGRP_mflg0_DSC_flat[rr_ghgrp, "LON_GHGRP"], GHGRP_mflg0_DSC_flat[rr_ghgrp, "LAT_GHGRP"]), fun = distHaversine)/1000 #distance unit: km
}

#check distribution of distance
dist_distance <- hist(GHGRP_mflg0_DSC_flat$distance, plot = FALSE)
plot(dist_distance, xlab = "distance GHGRP/NEI", ylab = "Counts", main = "match flag 0 DSC", col = "grey")
describe(GHGRP_mflg0_DSC_flat$distance)

#set an initial distance buffer
distance_buffer_bygrid = sqrt((4/2)**2 + (4/2)**2) #inspired by 4km grid

#if matched points are too far away (distance > distance_buffer_bygrid), remove them from GHGRP_mflg0_DSC_flat
GHGRP_mflg0_flat = GHGRP_mflg0_DSC_flat %>% filter(GHGRP_mflg0_DSC_flat$distance <= distance_buffer_bygrid)

#add match quality flag
GHGRP_mflg0_flat$match_quality_flag = 0 #matched by facility name and passed distance sanity check, no manual work required
GHGRP_mflg0_flat$manual_check_instruction = "no manual work required"

# Get a subset of GHGRP that are not matched after flag 0
GHGRP_mflg0_leftover = GHGRP %>% filter(!(GHGRP$GHGRP_ID %in% GHGRP_mflg0_flat$GHGRP_ID))
GHGRP_mflg0_leftover = subset(GHGRP_mflg0_leftover, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP"))

write.csv(GHGRP_mflg0_flat, file = "GHGRP_refineries_mflg0_flat.csv")
write.csv(GHGRP_mflg0_leftover, file = "GHGRP_refineries_mflg0_leftover.csv")

#############################################################################################################################
#match_qual_flag = 1: mutually closest and distance sanity check          

##########################
#prepare NEI_match1_pool array
NEI_match1_pool = NEI_match0_pool %>% filter(!(NEI_match0_pool$NEI_FACILITY_ID %in% GHGRP_mflg0_flat$NEI_FACILITY_ID))

#build distance matrix
dis_matrix <- array(c(0), dim = c(nrow(GHGRP_mflg0_leftover), nrow(NEI_match1_pool)))
for (rr_ghgrp in 1:nrow(dis_matrix)) {
  for (cc_nei in 1:ncol(dis_matrix)) {
    dis_matrix[rr_ghgrp,cc_nei] = distm(c(NEI_match1_pool[cc_nei, "LON_NEI"], NEI_match1_pool[cc_nei, "LAT_NEI"]), c(GHGRP_mflg0_leftover[rr_ghgrp, "LON_GHGRP"], GHGRP_mflg0_leftover[rr_ghgrp, "LAT_GHGRP"]), fun = distHaversine)/1000 #distance unit: km
  }
}

##########################
#perform lat lon matching
max_mflg1_NEI_FACILITY_ID = 0
for (rr_ghgrp in 1:nrow(dis_matrix)) {
  #find the nearest NEI to current GHGRP
  closest_NEI = which(dis_matrix[rr_ghgrp,] == min(dis_matrix[rr_ghgrp,]))
  num_closest_NEI = length(closest_NEI)
  match_NEI_FACILITY_ID = 0
  for (ii in 1:num_closest_NEI) {
    cc_nei = closest_NEI[ii]
    closest_GHGRP = which(dis_matrix[,cc_nei] == min(dis_matrix[,cc_nei]))
    if (rr_ghgrp %in% closest_GHGRP){
      match_NEI_FACILITY_ID = match_NEI_FACILITY_ID + 1
      #write down all the NEI NEI_FACILITY_ID that meet the matching criteria
      GHGRP_mflg0_leftover[rr_ghgrp, sprintf("NEI_FACILITY_ID_%d", match_NEI_FACILITY_ID)] = NEI_match1_pool[cc_nei, "NEI_FACILITY_ID"]
    }
  }
  if (match_NEI_FACILITY_ID > max_mflg1_NEI_FACILITY_ID){
    max_mflg1_NEI_FACILITY_ID = match_NEI_FACILITY_ID
  }
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of GHGRP_mflg0_leftover that are match flag 1
GHGRP_mflg1 = GHGRP_mflg0_leftover %>% filter(NEI_FACILITY_ID_1 != 'NA')

#reorgainize the table to have each NEI_FACILITY_ID in a private row
#Isolating (max_mflg1_NEI_FACILITY_ID) subsets of GHGRP_mflg1
#each subset only have one NEI_FACILITY_ID_* and rename it to NEI_FACILITY_ID
for(i in 1:max_mflg1_NEI_FACILITY_ID) { 
  NEI_FACILITY_ID_i <- paste("NEI_FACILITY_ID_", i, sep = "")
  GHGRP_mflg1_NEI_FACILITY_IDi = subset(GHGRP_mflg1, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP",NEI_FACILITY_ID_i))
  colnames(GHGRP_mflg1_NEI_FACILITY_IDi)[colnames(GHGRP_mflg1_NEI_FACILITY_IDi) == NEI_FACILITY_ID_i] ="NEI_FACILITY_ID"
  #stack the subset copies
  if (i==1){
    GHGRP_mflg1_flat_scratch = GHGRP_mflg1_NEI_FACILITY_IDi
  } else{
    GHGRP_mflg1_flat_scratch = rbind(GHGRP_mflg1_flat_scratch, GHGRP_mflg1_NEI_FACILITY_IDi)
  }
}

#remove rows with NEI_FACILITY_ID == NA #note:filter(!is.na(NEI_FACILITY_ID)) is the same as filter(NEI_FACILITY_ID != 'NA')
GHGRP_mflg1_flat = GHGRP_mflg1_flat_scratch %>% filter(!is.na(NEI_FACILITY_ID))
GHGRP_mflg1_flat = join(GHGRP_mflg1_flat,NEI_match1_pool, by="NEI_FACILITY_ID", type="left")
for (rr_ghgrp in 1:nrow(GHGRP_mflg1_flat)) {
  GHGRP_mflg1_flat[rr_ghgrp, "distance"] = distm(c(GHGRP_mflg1_flat[rr_ghgrp, "LON_NEI"], GHGRP_mflg1_flat[rr_ghgrp, "LAT_NEI"]), c(GHGRP_mflg1_flat[rr_ghgrp, "LON_GHGRP"], GHGRP_mflg1_flat[rr_ghgrp, "LAT_GHGRP"]), fun = distHaversine)/1000 #distance unit: km
}

#check distribution of distance
dist_distance <- hist(GHGRP_mflg1_flat$distance, plot = FALSE)
plot(dist_distance, xlab = "distance GHGRP/NEI", ylab = "Counts", main = "match flag 1 DSC", col = "grey")
describe(GHGRP_mflg1_flat$distance)

#Update distance_buffer based on the experience learned in 1:1 perfect match
distance_buffer = max(GHGRP_mflg0_flat$distance)

#add match quality flag
for (rr_ghgrp in 1:nrow(GHGRP_mflg1_flat)) {
  if (GHGRP_mflg1_flat[rr_ghgrp, "distance"]<=distance_buffer){
    GHGRP_mflg1_flat[rr_ghgrp, "match_quality_flag"] = 1.1 #mutually closest and distance smaller than buffer, manually check facility names
    GHGRP_mflg1_flat[rr_ghgrp, "manual_check_instruction"] = "check facility name"
    }else{
    GHGRP_mflg1_flat[rr_ghgrp, "match_quality_flag"] = 1.2 #mutually closest but distance greater than buffer, manually check facility names
    GHGRP_mflg1_flat[rr_ghgrp, "manual_check_instruction"] = "evaluate distance, check facility name"
    }
}
    
# Get a subset of GHGRP_mflg0_leftover that are not matched after flag 1
GHGRP_mflg1_leftover = GHGRP_mflg0_leftover %>% filter(!(GHGRP_mflg0_leftover$GHGRP_ID %in% GHGRP_mflg1_flat$GHGRP_ID))
GHGRP_mflg1_leftover = subset(GHGRP_mflg1_leftover, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP"))

write.csv(GHGRP_mflg1_flat, file = "GHGRP_refineries_mflg1_flat.csv")
write.csv(GHGRP_mflg1_leftover, file = "GHGRP_refineries_mflg1_leftover.csv")

#############################################################################################################################
#match_qual_flag = 2: GHGRP's closest NEI and distance sanity check
#after mutually closest search, if a GHGRP point cannot find a mutually closest NEI point,
#Just match the GHGRP's closest NEI to it, regardless of if the NEI's closest GHGRP is the current GHGRP

##########################
#prepare NEI_match2_pool array
NEI_match2_pool = NEI_match1_pool

#build distance matrix
dis_matrix <- array(c(0), dim = c(nrow(GHGRP_mflg1_leftover), nrow(NEI_match2_pool)))
for (rr_ghgrp in 1:nrow(dis_matrix)) {
  for (cc_nei in 1:ncol(dis_matrix)) {
    dis_matrix[rr_ghgrp,cc_nei] = distm(c(NEI_match2_pool[cc_nei, "LON_NEI"], NEI_match2_pool[cc_nei, "LAT_NEI"]), c(GHGRP_mflg1_leftover[rr_ghgrp, "LON_GHGRP"], GHGRP_mflg1_leftover[rr_ghgrp, "LAT_GHGRP"]), fun = distHaversine)/1000 #distance unit: km
  }
}

##########################
#perform lat lon matching
max_mflg2_NEI_FACILITY_ID = 0
for (rr_ghgrp in 1:nrow(dis_matrix)) {
  #find the nearest NEI to current GHGRP
  closest_NEI = which(dis_matrix[rr_ghgrp,] == min(dis_matrix[rr_ghgrp,]))
  num_closest_NEI = length(closest_NEI)
  match_NEI_FACILITY_ID = 0
  for (ii in 1:num_closest_NEI) {
    cc_nei = closest_NEI[ii]
    match_NEI_FACILITY_ID = match_NEI_FACILITY_ID + 1
    #write down all the NEI NEI_FACILITY_ID that meet the matching criteria
    GHGRP_mflg1_leftover[rr_ghgrp, sprintf("NEI_FACILITY_ID_%d", match_NEI_FACILITY_ID)] = NEI_match2_pool[cc_nei, "NEI_FACILITY_ID"]
    }
  if (match_NEI_FACILITY_ID > max_mflg2_NEI_FACILITY_ID){
    max_mflg2_NEI_FACILITY_ID = match_NEI_FACILITY_ID
  }
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of GHGRP_mflg1_leftover that are match flag 2
GHGRP_mflg2 = GHGRP_mflg1_leftover %>% filter(NEI_FACILITY_ID_1 != 'NA')

#reorgainize the table to have each NEI_FACILITY_ID in a private row
#Isolating (max_mflg2_NEI_FACILITY_ID) subsets of GHGRP_mflg2
#each subset only have one NEI_FACILITY_ID_* and rename it to NEI_FACILITY_ID
for(i in 1:max_mflg2_NEI_FACILITY_ID) { 
  NEI_FACILITY_ID_i <- paste("NEI_FACILITY_ID_", i, sep = "")
  GHGRP_mflg2_NEI_FACILITY_IDi = subset(GHGRP_mflg2, select=c("GHGRP_ID","GHGRP_FACILITY_NAME","LON_GHGRP","LAT_GHGRP",NEI_FACILITY_ID_i))
  colnames(GHGRP_mflg2_NEI_FACILITY_IDi)[colnames(GHGRP_mflg2_NEI_FACILITY_IDi) == NEI_FACILITY_ID_i] ="NEI_FACILITY_ID"
  #stack the subset copies
  if (i==1){
    GHGRP_mflg2_flat_scratch = GHGRP_mflg2_NEI_FACILITY_IDi
  } else{
    GHGRP_mflg2_flat_scratch = rbind(GHGRP_mflg2_flat_scratch, GHGRP_mflg2_NEI_FACILITY_IDi)
  }
}

#remove rows with NEI_FACILITY_ID == NA #note:filter(!is.na(NEI_FACILITY_ID)) is the same as filter(NEI_FACILITY_ID != 'NA')
GHGRP_mflg2_flat = GHGRP_mflg2_flat_scratch %>% filter(!is.na(NEI_FACILITY_ID))
GHGRP_mflg2_flat = join(GHGRP_mflg2_flat,NEI_match2_pool, by="NEI_FACILITY_ID", type="left")
for (rr_ghgrp in 1:nrow(GHGRP_mflg2_flat)) {
  GHGRP_mflg2_flat[rr_ghgrp, "distance"] = distm(c(GHGRP_mflg2_flat[rr_ghgrp, "LON_NEI"], GHGRP_mflg2_flat[rr_ghgrp, "LAT_NEI"]), c(GHGRP_mflg2_flat[rr_ghgrp, "LON_GHGRP"], GHGRP_mflg2_flat[rr_ghgrp, "LAT_GHGRP"]), fun = distHaversine)/1000 #distance unit: km
}

#check distribution of distance
dist_distance <- hist(GHGRP_mflg2_flat$distance, plot = FALSE)
plot(dist_distance, xlab = "distance GHGRP/NEI", ylab = "Counts", main = "match flag 2 DSC", col = "grey")
describe(GHGRP_mflg2_flat$distance)

#add match quality flag
for (rr_ghgrp in 1:nrow(GHGRP_mflg2_flat)) {
  if (GHGRP_mflg2_flat[rr_ghgrp, "distance"]<=distance_buffer){
    GHGRP_mflg2_flat[rr_ghgrp, "match_quality_flag"] = 2.1 #GHGRP's closest NEI and distance smaller than buffer, manually check facility names
    GHGRP_mflg2_flat[rr_ghgrp, "manual_check_instruction"] = "check facility name (NEI might be already occupied)"
    }else{
    GHGRP_mflg2_flat[rr_ghgrp, "match_quality_flag"] = 2.2 #GHGRP's closest NEI but distance greater than buffer, manually check facility names
    GHGRP_mflg2_flat[rr_ghgrp, "manual_check_instruction"] = "evaluate distance, check facility name (NEI might be already occupied)"
    }
}

# Get a subset of GHGRP_mflg1_leftover that are not matched after flag 2
GHGRP_mflg2_leftover = GHGRP_mflg1_leftover %>% filter(!(GHGRP_mflg1_leftover$GHGRP_ID %in% GHGRP_mflg2_flat$GHGRP_ID))
#GHGRP_mflg2_leftover is and should be 0

write.csv(GHGRP_mflg2_flat, file = "GHGRP_refineries_mflg2_flat.csv")

#############################################################################################################################
#combine GHGRP_mflg0_flat, GHGRP_mflg1_flat, and GHGRP_mflg2_flat
GHGRP_NEI_match_flat = rbind(GHGRP_mflg0_flat, GHGRP_mflg1_flat, GHGRP_mflg2_flat)

#adding state, city, address info is helpful in manual checking
GHGRP_FULL_INFO <- read.csv("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_GHGRP_NEI_IND/spreadsheets/input/GHGRP_refineries_2017_CO2_FC.csv")
colnames(GHGRP_FULL_INFO)[colnames(GHGRP_FULL_INFO) == "FACILITY.NAME"] ="GHGRP_FACILITY_NAME"
colnames(GHGRP_FULL_INFO)[colnames(GHGRP_FULL_INFO) == "LONGITUDE"] ="LON_GHGRP"
colnames(GHGRP_FULL_INFO)[colnames(GHGRP_FULL_INFO) == "LATITUDE"] ="LAT_GHGRP"
colnames(GHGRP_FULL_INFO)[colnames(GHGRP_FULL_INFO) == "GHGRP.ID"] ="GHGRP_ID"

GHGRP_NEI_match_flat = join(GHGRP_NEI_match_flat, GHGRP_FULL_INFO, by = c("GHGRP_ID", "GHGRP_FACILITY_NAME", "LON_GHGRP", "LAT_GHGRP"))

#reorganize columns
GHGRP_NEI_match_flat = subset(GHGRP_NEI_match_flat, select=c("GHGRP_ID","NEI_FACILITY_ID","LON_GHGRP","LAT_GHGRP","LON_NEI","LAT_NEI","REPORTED.ADDRESS","CITY.NAME","COUNTY.NAME","STATE","PARENT.COMPANIES","distance","GHGRP_FACILITY_NAME","NEI_FACILITY_NAME","match_quality_flag","manual_check_instruction"))

write.csv(GHGRP_NEI_match_flat, file = "GHGRP_NEI_refineries_match_flat.csv")